import xml.sax, sys
from xml.sax.handler import *

class SentencesExtractor(ContentHandler):
    """SAX handler that collects the words of each ``SENT`` element.

    A word is the whitespace-stripped text found inside a ``w`` element.
    When a ``SENT`` element closes, the words seen since the previous
    sentence are joined with single spaces and stored together with the
    value of that element's ``nb`` attribute (``None`` when absent).
    """

    def __init__(self):
        # Called explicitly (not via super()) so it works regardless of
        # whether the base class is a new-style class.
        ContentHandler.__init__(self)
        self._sentences = []           # finished (text, nb) tuples
        self._words = []               # words of the sentence in progress
        self._characters = u''         # text of the <w> element in progress
        self._current_sent_nb = None   # "nb" attribute of the open SENT
        self._in_word = False          # True between <w> and </w>

    def startElement(self, name, attrs):
        """Record the sentence number or begin buffering a word."""
        if name == "SENT":
            self._current_sent_nb = attrs.get("nb")
        elif name == "w":
            # Start from a clean buffer so text that appeared outside any
            # <w> element can never be glued onto this word.
            self._in_word = True
            self._characters = u''

    def endElement(self, name):
        """Finish the current word or the current sentence."""
        if name == "SENT":
            sentence = " ".join(self._words)
            self._sentences.append((sentence, self._current_sent_nb))
            self._words = []
        elif name == "w":
            self._words.append(self._characters.strip())
            self._characters = u''
            self._in_word = False

    def characters(self, ch):
        """Buffer character data, but only while inside a ``w`` element.

        The parser may deliver the text of one element in several chunks,
        so chunks are concatenated until the element closes.
        """
        if self._in_word:
            self._characters += ch

    def get_sentences(self):
        """Return the list of ``(sentence_text, nb)`` tuples collected so far."""
        return self._sentences


def XML_to_sentences(xmlstring):
    """Parse an XML document and return its sentences.

    ``xmlstring`` may be a text string, which is encoded as latin-1 before
    parsing, or an already-encoded byte string, which is handed to the
    parser unchanged.  The document is processed by ``SentencesExtractor``
    and the result of its ``get_sentences()`` is returned.

    A line reporting the type of ``xmlstring`` is printed to standard
    output as a diagnostic trace.

    Raises ``UnicodeEncodeError`` if a text string contains characters that
    cannot be represented in latin-1; parser errors propagate unchanged.
    """
    handler = SentencesExtractor()
    # Parenthesised single-argument form: same output under the Python 2
    # print statement, and also valid as a Python 3 function call.
    print("type xmlstring %s" % type(xmlstring))
    if isinstance(xmlstring, bytes):
        # Already bytes: calling .encode() here would trigger an implicit
        # ASCII decode on Python 2 (failing on non-ASCII input) and does not
        # exist on Python 3 bytes objects.
        data = xmlstring
    else:
        data = xmlstring.encode("latin-1")
    xml.sax.parseString(data, handler)
    return handler.get_sentences()

